library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(magrittr)
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v readr 1.3.1
## v tibble 2.1.3 v purrr 0.3.2
## v tidyr 0.8.3 v stringr 1.4.0
## v ggplot2 3.2.1 v forcats 0.4.0
## -- Conflicts -------------------------------------------------------------- tidyverse_conflicts() --
## x tidyr::extract() masks magrittr::extract()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::set_names() masks magrittr::set_names()
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
library(corrplot)
## corrplot 0.84 loaded
library(DataExplorer)
library(class)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(e1071)
library(investr)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(leaps)
library(Hmisc)
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:e1071':
##
## impute
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:dplyr':
##
## recode
attritdf<-read.csv(file="/Users/malco/Documents/MDS/MDS_6306_DDS/website1/MCCWebsite/data/CaseStudy2-data.csv",header=TRUE)
noattritdf <- read.csv(file="/Users/malco/Documents/MDS/MDS_6306_DDS/website1/MCCWebsite/data/CaseStudy2CompSetNoAttrition.csv",header=TRUE)
nosalarydf <- read.csv(file="/Users/malco/Documents/MDS/MDS_6306_DDS/website1/MCCWebsite/data/CaseStudy2CompSetNoSalary.csv",header=TRUE)
attritdf$AgeGrp<-cut(attritdf$Age,breaks = c(17,24,34,44,54,Inf),labels=c('18-24','25-34','35-44','46-54','55+'))
noattritdf$AgeGrp<-cut(noattritdf$Age,breaks = c(17,24,34,44,54,Inf),labels=c('18-24','25-34','35-44','46-54','55+'))
attritdf <- attritdf %>% mutate(AttrNum = case_when(
Attrition == "Yes" ~ 1,
Attrition == "No" ~ 0
))
attritdf <- attritdf %>% mutate(GenNum = case_when(
Gender == "Male" ~ 1,
Gender == "Female" ~ 0
))
noattritdf <- noattritdf %>% mutate(GenNum = case_when(
Gender == "Male" ~ 1,
Gender == "Female" ~ 0
))
attritdf <- attritdf %>% mutate(TravLevel = case_when(
BusinessTravel == "Non-Travel" ~ 0,
BusinessTravel == "Travel_Rarely" ~ 1,
BusinessTravel == "Travel_Frequently" ~ 2
))
noattritdf <- noattritdf %>% mutate(TravLevel = case_when(
BusinessTravel == "Non-Travel" ~ 0,
BusinessTravel == "Travel_Rarely" ~ 1,
BusinessTravel == "Travel_Frequently" ~ 2
))
attritdf <- attritdf %>% mutate(OTNum = case_when(
OverTime == "Yes" ~ 1,
OverTime == "No" ~0
))
noattritdf <- noattritdf %>% mutate(OTNum = case_when(
OverTime == "Yes" ~ 1,
OverTime == "No" ~0
))
attritdf <- attritdf %>% mutate(MariStatNum = case_when(
MaritalStatus == "Single" ~ 0,
MaritalStatus == "Married" ~ 1,
MaritalStatus == "Divorced" ~ 2
))
noattritdf <- noattritdf %>% mutate(MariStatNum = case_when(
MaritalStatus == "Single" ~ 0,
MaritalStatus == "Married" ~ 1,
MaritalStatus == "Divorced" ~ 2
))
attritdf <- attritdf %>% mutate(EducNum = case_when(
EducationField == "Medical" ~ 0,
EducationField == "Life Sciences" ~ 1,
EducationField == "Marketing" ~ 2,
EducationField == "Technical Degree" ~ 3,
EducationField == "Human Resources" ~ 4,
EducationField == "Other" ~ 5
))
noattritdf <- noattritdf %>% mutate(EducNum = case_when(
EducationField == "Medical" ~ 0,
EducationField == "Life Sciences" ~ 1,
EducationField == "Marketing" ~ 2,
EducationField == "Technical Degree" ~ 3,
EducationField == "Human Resources" ~ 4,
EducationField == "Other" ~ 5
))
attritdf <- attritdf %>% mutate(JRoleNum = case_when(
JobRole == "Research Director" ~ 0,
JobRole == "Manufacturing Director" ~ 1,
JobRole == "Sales Executive" ~ 2,
JobRole == "Research Scientist" ~ 3,
JobRole == "Sales Representative" ~ 4,
JobRole == "Healthcare Representative" ~ 5,
JobRole == "Manager" ~ 6,
JobRole == "Laboratory Technician" ~ 7,
JobRole == "Human Resources" ~ 8
))
noattritdf <- noattritdf %>% mutate(JRoleNum = case_when(
JobRole == "Research Director" ~ 0,
JobRole == "Manufacturing Director" ~ 1,
JobRole == "Sales Executive" ~ 2,
JobRole == "Research Scientist" ~ 3,
JobRole == "Sales Representative" ~ 4,
JobRole == "Healthcare Representative" ~ 5,
JobRole == "Manager" ~ 6,
JobRole == "Laboratory Technician" ~ 7,
JobRole == "Human Resources" ~ 8
))
allnumeric<-attritdf%>%dplyr::select(Age,AgeGrp,AttrNum,TravLevel,DailyRate,DistanceFromHome,Education,EducNum,EnvironmentSatisfaction,GenNum,HourlyRate,JobInvolvement,JobLevel,JRoleNum,JobSatisfaction,MariStatNum,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OTNum,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager)
attritdf$JobInvolvement <- as.factor(attritdf$JobInvolvement)
noattritdf$JobInvolvement <- as.factor(noattritdf$JobInvolvement)
nosalarydf$JobInvolvement <- as.factor(nosalarydf$JobInvolvement)
attritdf$JobLevel <- as.factor(attritdf$JobLevel)
noattritdf$JobLevel <- as.factor(noattritdf$JobLevel)
nosalarydf$JobLevel <- as.factor(nosalarydf$JobLevel)
attritdf$JobSatisfaction <- as.factor(attritdf$JobSatisfaction)
noattritdf$JobSatisfaction <- as.factor(noattritdf$JobSatisfaction)
nosalarydf$JobSatisfaction <- as.factor(nosalarydf$JobSatisfaction)
attritdf$PerformanceRating <- as.factor(attritdf$PerformanceRating)
noattritdf$PerformanceRating <- as.factor(noattritdf$PerformanceRating)
nosalarydf$PerformanceRating <- as.factor(nosalarydf$PerformanceRating)
attritdf$RelationshipSatisfaction <- as.factor(attritdf$RelationshipSatisfaction)
noattritdf$RelationshipSatisfaction <- as.factor(noattritdf$RelationshipSatisfaction)
nosalarydf$RelationshipSatisfaction <- as.factor(nosalarydf$RelationshipSatisfaction)
attritdf$StockOptionLevel <- as.factor(attritdf$StockOptionLevel)
nosalarydf$StockOptionLevel <- as.factor(nosalarydf$StockOptionLevel)
noattritdf$StockOptionLevel <- as.factor(noattritdf$StockOptionLevel)
attritdf$TrainingTimesLastYear <- as.factor(attritdf$TrainingTimesLastYear)
noattritdf$TrainingTimesLastYear <- as.factor(noattritdf$TrainingTimesLastYear)
nosalarydf$TrainingTimesLastYear <- as.factor(nosalarydf$TrainingTimesLastYear)
attritdf$WorkLifeBalance <- as.factor(attritdf$WorkLifeBalance)
noattritdf$WorkLifeBalance <- as.factor(noattritdf$WorkLifeBalance)
nosalarydf$WorkLifeBalance <- as.factor(nosalarydf$WorkLifeBalance)
attritdf$Education <- as.factor(attritdf$Education)
noattritdf$Education <- as.factor(noattritdf$Education)
nosalarydf$Education <- as.factor(nosalarydf$Education)
attritdf$EnvironmentSatisfaction <- as.factor(attritdf$EnvironmentSatisfaction)
noattritdf$EnvironmentSatisfaction <- as.factor(noattritdf$EnvironmentSatisfaction)
nosalarydf$EnvironmentSatisfaction <- as.factor(nosalarydf$EnvironmentSatisfaction)
str(attritdf)
## 'data.frame': 870 obs. of 44 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int 32 40 35 32 24 27 41 37 34 34 ...
## $ Attrition : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ BusinessTravel : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 3 2 3 2 2 3 3 3 2 ...
## $ DailyRate : int 117 1308 200 801 567 294 1283 309 1333 653 ...
## $ Department : Factor w/ 3 levels "Human Resources",..: 3 2 2 3 2 2 2 3 3 2 ...
## $ DistanceFromHome : int 13 14 18 1 2 10 5 10 10 10 ...
## $ Education : Factor w/ 5 levels "1","2","3","4",..: 4 3 2 4 1 2 5 4 4 4 ...
## $ EducationField : Factor w/ 6 levels "Human Resources",..: 2 4 2 3 6 2 4 2 2 6 ...
## $ EmployeeCount : int 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : int 859 1128 1412 2016 1646 733 1448 1105 1055 1597 ...
## $ EnvironmentSatisfaction : Factor w/ 4 levels "1","2","3","4": 2 3 3 3 1 4 2 4 3 4 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 2 1 1 2 2 1 1 2 ...
## $ HourlyRate : int 73 44 60 48 32 32 90 88 87 92 ...
## $ JobInvolvement : Factor w/ 4 levels "1","2","3","4": 3 2 3 3 3 3 4 2 3 2 ...
## $ JobLevel : Factor w/ 5 levels "1","2","3","4",..: 2 5 3 3 1 3 1 2 1 2 ...
## $ JobRole : Factor w/ 9 levels "Healthcare Representative",..: 8 6 5 8 7 5 7 8 9 1 ...
## $ JobSatisfaction : Factor w/ 4 levels "1","2","3","4": 4 3 4 4 4 1 3 4 3 3 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 1 3 3 2 3 1 2 1 2 2 ...
## $ MonthlyIncome : int 4403 19626 9362 10422 3760 8793 2127 6694 2220 5063 ...
## $ MonthlyRate : int 9250 17544 19944 24032 17218 4809 5561 24223 18410 15332 ...
## $ NumCompaniesWorked : int 2 1 2 1 1 1 2 2 1 1 ...
## $ Over18 : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
## $ OverTime : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 2 2 2 1 ...
## $ PercentSalaryHike : int 11 14 11 19 13 21 12 14 19 14 ...
## $ PerformanceRating : Factor w/ 2 levels "3","4": 1 1 1 1 1 2 1 1 1 1 ...
## $ RelationshipSatisfaction: Factor w/ 4 levels "1","2","3","4": 3 1 3 3 3 3 1 3 4 2 ...
## $ StandardHours : int 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : Factor w/ 4 levels "0","1","2","3": 2 1 1 3 1 3 1 4 2 2 ...
## $ TotalWorkingYears : int 8 21 10 14 6 9 7 8 1 8 ...
## $ TrainingTimesLastYear : Factor w/ 7 levels "0","1","2","3",..: 4 3 3 4 3 5 6 6 3 4 ...
## $ WorkLifeBalance : Factor w/ 4 levels "1","2","3","4": 2 4 3 3 3 2 2 3 3 2 ...
## $ YearsAtCompany : int 5 20 2 14 6 9 4 1 1 8 ...
## $ YearsInCurrentRole : int 2 7 2 10 3 7 2 0 1 2 ...
## $ YearsSinceLastPromotion : int 0 4 2 5 1 1 0 0 0 7 ...
## $ YearsWithCurrManager : int 3 9 2 7 3 7 3 0 0 7 ...
## $ AgeGrp : Factor w/ 5 levels "18-24","25-34",..: 2 3 3 2 1 2 3 3 2 2 ...
## $ AttrNum : num 0 0 0 0 0 0 0 0 0 0 ...
## $ GenNum : num 1 1 1 0 0 1 1 0 0 1 ...
## $ TravLevel : num 1 1 2 1 2 2 1 1 1 2 ...
## $ OTNum : num 0 0 0 0 1 0 1 1 1 0 ...
## $ MariStatNum : num 2 0 0 1 0 2 1 2 1 1 ...
## $ EducNum : num 1 0 1 2 3 1 0 1 1 3 ...
## $ JRoleNum : num 2 0 1 2 3 1 3 2 4 5 ...
names(attritdf)
## [1] "ID" "Age"
## [3] "Attrition" "BusinessTravel"
## [5] "DailyRate" "Department"
## [7] "DistanceFromHome" "Education"
## [9] "EducationField" "EmployeeCount"
## [11] "EmployeeNumber" "EnvironmentSatisfaction"
## [13] "Gender" "HourlyRate"
## [15] "JobInvolvement" "JobLevel"
## [17] "JobRole" "JobSatisfaction"
## [19] "MaritalStatus" "MonthlyIncome"
## [21] "MonthlyRate" "NumCompaniesWorked"
## [23] "Over18" "OverTime"
## [25] "PercentSalaryHike" "PerformanceRating"
## [27] "RelationshipSatisfaction" "StandardHours"
## [29] "StockOptionLevel" "TotalWorkingYears"
## [31] "TrainingTimesLastYear" "WorkLifeBalance"
## [33] "YearsAtCompany" "YearsInCurrentRole"
## [35] "YearsSinceLastPromotion" "YearsWithCurrManager"
## [37] "AgeGrp" "AttrNum"
## [39] "GenNum" "TravLevel"
## [41] "OTNum" "MariStatNum"
## [43] "EducNum" "JRoleNum"
describe(attritdf)
## attritdf
##
## 44 Variables 870 Observations
## ---------------------------------------------------------------------------
## ID
## n missing distinct Info Mean Gmd .05 .10
## 870 0 870 1 435.5 290.3 44.45 87.90
## .25 .50 .75 .90 .95
## 218.25 435.50 652.75 783.10 826.55
##
## lowest : 1 2 3 4 5, highest: 866 867 868 869 870
## ---------------------------------------------------------------------------
## Age
## n missing distinct Info Mean Gmd .05 .10
## 870 0 43 0.999 36.83 10.07 24 26
## .25 .50 .75 .90 .95
## 30 35 43 50 54
##
## lowest : 18 19 20 21 22, highest: 56 57 58 59 60
## ---------------------------------------------------------------------------
## Attrition
## n missing distinct
## 870 0 2
##
## Value No Yes
## Frequency 730 140
## Proportion 0.839 0.161
## ---------------------------------------------------------------------------
## BusinessTravel
## n missing distinct
## 870 0 3
##
## Value Non-Travel Travel_Frequently Travel_Rarely
## Frequency 94 158 618
## Proportion 0.108 0.182 0.710
## ---------------------------------------------------------------------------
## DailyRate
## n missing distinct Info Mean Gmd .05 .10
## 870 0 627 1 815.2 463.3 175.4 257.8
## .25 .50 .75 .90 .95
## 472.5 817.5 1165.8 1368.0 1436.7
##
## lowest : 103 111 117 119 120, highest: 1490 1495 1496 1498 1499
## ---------------------------------------------------------------------------
## Department
## n missing distinct
## 870 0 3
##
## Value Human Resources Research & Development
## Frequency 35 562
## Proportion 0.040 0.646
##
## Value Sales
## Frequency 273
## Proportion 0.314
## ---------------------------------------------------------------------------
## DistanceFromHome
## n missing distinct Info Mean Gmd .05 .10
## 870 0 29 0.993 9.339 8.843 1.0 1.0
## .25 .50 .75 .90 .95
## 2.0 7.0 14.0 23.1 26.0
##
## lowest : 1 2 3 4 5, highest: 25 26 27 28 29
## ---------------------------------------------------------------------------
## Education
## n missing distinct
## 870 0 5
##
## lowest : 1 2 3 4 5, highest: 1 2 3 4 5
##
## Value 1 2 3 4 5
## Frequency 98 182 324 240 26
## Proportion 0.113 0.209 0.372 0.276 0.030
## ---------------------------------------------------------------------------
## EducationField
## n missing distinct
## 870 0 6
##
## lowest : Human Resources Life Sciences Marketing Medical Other
## highest: Life Sciences Marketing Medical Other Technical Degree
##
## Value Human Resources Life Sciences Marketing
## Frequency 15 358 100
## Proportion 0.017 0.411 0.115
##
## Value Medical Other Technical Degree
## Frequency 270 52 75
## Proportion 0.310 0.060 0.086
## ---------------------------------------------------------------------------
## EmployeeCount
## n missing distinct Info Mean Gmd
## 870 0 1 0 1 0
##
## Value 1
## Frequency 870
## Proportion 1
## ---------------------------------------------------------------------------
## EmployeeNumber
## n missing distinct Info Mean Gmd .05 .10
## 870 0 870 1 1030 698.6 86.9 191.1
## .25 .50 .75 .90 .95
## 477.2 1039.0 1561.5 1856.2 1958.3
##
## lowest : 1 4 11 13 14, highest: 2041 2053 2056 2062 2064
## ---------------------------------------------------------------------------
## EnvironmentSatisfaction
## n missing distinct
## 870 0 4
##
## Value 1 2 3 4
## Frequency 172 178 258 262
## Proportion 0.198 0.205 0.297 0.301
## ---------------------------------------------------------------------------
## Gender
## n missing distinct
## 870 0 2
##
## Value Female Male
## Frequency 354 516
## Proportion 0.407 0.593
## ---------------------------------------------------------------------------
## HourlyRate
## n missing distinct Info Mean Gmd .05 .10
## 870 0 71 1 65.61 23.24 34 39
## .25 .50 .75 .90 .95
## 48 66 83 94 97
##
## lowest : 30 31 32 33 34, highest: 96 97 98 99 100
## ---------------------------------------------------------------------------
## JobInvolvement
## n missing distinct
## 870 0 4
##
## Value 1 2 3 4
## Frequency 47 228 514 81
## Proportion 0.054 0.262 0.591 0.093
## ---------------------------------------------------------------------------
## JobLevel
## n missing distinct
## 870 0 5
##
## lowest : 1 2 3 4 5, highest: 1 2 3 4 5
##
## Value 1 2 3 4 5
## Frequency 329 312 132 60 37
## Proportion 0.378 0.359 0.152 0.069 0.043
## ---------------------------------------------------------------------------
## JobRole
## n missing distinct
## 870 0 9
##
## lowest : Healthcare Representative Human Resources Laboratory Technician Manager Manufacturing Director
## highest: Manufacturing Director Research Director Research Scientist Sales Executive Sales Representative
##
## Healthcare Representative (76, 0.087), Human Resources (27, 0.031),
## Laboratory Technician (153, 0.176), Manager (51, 0.059), Manufacturing
## Director (87, 0.100), Research Director (51, 0.059), Research Scientist
## (172, 0.198), Sales Executive (200, 0.230), Sales Representative (53,
## 0.061)
## ---------------------------------------------------------------------------
## JobSatisfaction
## n missing distinct
## 870 0 4
##
## Value 1 2 3 4
## Frequency 179 166 254 271
## Proportion 0.206 0.191 0.292 0.311
## ---------------------------------------------------------------------------
## MaritalStatus
## n missing distinct
## 870 0 3
##
## Value Divorced Married Single
## Frequency 191 410 269
## Proportion 0.220 0.471 0.309
## ---------------------------------------------------------------------------
## MonthlyIncome
## n missing distinct Info Mean Gmd .05 .10
## 870 0 826 1 6390 4757 2088 2279
## .25 .50 .75 .90 .95
## 2840 4946 8182 13571 17165
##
## lowest : 1081 1091 1102 1118 1129, highest: 19845 19859 19926 19943 19999
## ---------------------------------------------------------------------------
## MonthlyRate
## n missing distinct Info Mean Gmd .05 .10
## 870 0 852 1 14326 8210 3456 4751
## .25 .50 .75 .90 .95
## 8092 14074 20456 24045 25541
##
## lowest : 2094 2104 2112 2125 2137, highest: 26862 26933 26959 26968 26997
## ---------------------------------------------------------------------------
## NumCompaniesWorked
## n missing distinct Info Mean Gmd .05 .10
## 870 0 10 0.945 2.728 2.683 0 0
## .25 .50 .75 .90 .95
## 1 2 4 7 8
##
## lowest : 0 1 2 3 4, highest: 5 6 7 8 9
##
## Value 0 1 2 3 4 5 6 7 8 9
## Frequency 111 320 74 91 85 43 39 46 28 33
## Proportion 0.128 0.368 0.085 0.105 0.098 0.049 0.045 0.053 0.032 0.038
## ---------------------------------------------------------------------------
## Over18
## n missing distinct value
## 870 0 1 Y
##
## Value Y
## Frequency 870
## Proportion 1
## ---------------------------------------------------------------------------
## OverTime
## n missing distinct
## 870 0 2
##
## Value No Yes
## Frequency 618 252
## Proportion 0.71 0.29
## ---------------------------------------------------------------------------
## PercentSalaryHike
## n missing distinct Info Mean Gmd .05 .10
## 870 0 15 0.988 15.2 4.047 11 11
## .25 .50 .75 .90 .95
## 12 14 18 21 22
##
## lowest : 11 12 13 14 15, highest: 21 22 23 24 25
##
## Value 11 12 13 14 15 16 17 18 19 20
## Frequency 126 119 123 120 54 43 56 57 40 27
## Proportion 0.145 0.137 0.141 0.138 0.062 0.049 0.064 0.066 0.046 0.031
##
## Value 21 22 23 24 25
## Frequency 33 30 17 14 11
## Proportion 0.038 0.034 0.020 0.016 0.013
## ---------------------------------------------------------------------------
## PerformanceRating
## n missing distinct
## 870 0 2
##
## Value 3 4
## Frequency 738 132
## Proportion 0.848 0.152
## ---------------------------------------------------------------------------
## RelationshipSatisfaction
## n missing distinct
## 870 0 4
##
## Value 1 2 3 4
## Frequency 174 171 261 264
## Proportion 0.200 0.197 0.300 0.303
## ---------------------------------------------------------------------------
## StandardHours
## n missing distinct Info Mean Gmd
## 870 0 1 0 80 0
##
## Value 80
## Frequency 870
## Proportion 1
## ---------------------------------------------------------------------------
## StockOptionLevel
## n missing distinct
## 870 0 4
##
## Value 0 1 2 3
## Frequency 379 355 81 55
## Proportion 0.436 0.408 0.093 0.063
## ---------------------------------------------------------------------------
## TotalWorkingYears
## n missing distinct Info Mean Gmd .05 .10
## 870 0 39 0.995 11.05 8.048 1 3
## .25 .50 .75 .90 .95
## 6 10 15 22 26
##
## lowest : 0 1 2 3 4, highest: 34 35 36 37 40
## ---------------------------------------------------------------------------
## TrainingTimesLastYear
## n missing distinct
## 870 0 7
##
## lowest : 0 1 2 3 4, highest: 2 3 4 5 6
##
## Value 0 1 2 3 4 5 6
## Frequency 30 39 309 308 73 75 36
## Proportion 0.034 0.045 0.355 0.354 0.084 0.086 0.041
## ---------------------------------------------------------------------------
## WorkLifeBalance
## n missing distinct
## 870 0 4
##
## Value 1 2 3 4
## Frequency 48 192 532 98
## Proportion 0.055 0.221 0.611 0.113
## ---------------------------------------------------------------------------
## YearsAtCompany
## n missing distinct Info Mean Gmd .05 .10
## 870 0 32 0.993 6.962 6.208 1 1
## .25 .50 .75 .90 .95
## 3 5 10 15 20
##
## lowest : 0 1 2 3 4, highest: 30 31 32 33 40
## ---------------------------------------------------------------------------
## YearsInCurrentRole
## n missing distinct Info Mean Gmd .05 .10
## 870 0 19 0.973 4.205 3.967 0 0
## .25 .50 .75 .90 .95
## 2 3 7 9 11
##
## lowest : 0 1 2 3 4, highest: 14 15 16 17 18
##
## Value 0 1 2 3 4 5 6 7 8 9
## Frequency 151 38 223 68 53 26 17 136 56 40
## Proportion 0.174 0.044 0.256 0.078 0.061 0.030 0.020 0.156 0.064 0.046
##
## Value 10 11 12 13 14 15 16 17 18
## Frequency 14 15 7 9 7 3 3 3 1
## Proportion 0.016 0.017 0.008 0.010 0.008 0.003 0.003 0.003 0.001
## ---------------------------------------------------------------------------
## YearsSinceLastPromotion
## n missing distinct Info Mean Gmd .05 .10
## 870 0 16 0.923 2.169 2.961 0 0
## .25 .50 .75 .90 .95
## 0 1 3 7 9
##
## lowest : 0 1 2 3 4, highest: 11 12 13 14 15
##
## Value 0 1 2 3 4 5 6 7 8 9
## Frequency 342 214 94 32 32 30 23 41 12 9
## Proportion 0.393 0.246 0.108 0.037 0.037 0.034 0.026 0.047 0.014 0.010
##
## Value 10 11 12 13 14 15
## Frequency 4 14 5 5 5 8
## Proportion 0.005 0.016 0.006 0.006 0.006 0.009
## ---------------------------------------------------------------------------
## YearsWithCurrManager
## n missing distinct Info Mean Gmd .05 .10
## 870 0 17 0.976 4.14 3.938 0 0
## .25 .50 .75 .90 .95
## 2 3 7 9 10
##
## lowest : 0 1 2 3 4, highest: 12 13 14 15 17
##
## Value 0 1 2 3 4 5 6 7 8 9
## Frequency 166 40 202 76 51 22 12 131 68 44
## Proportion 0.191 0.046 0.232 0.087 0.059 0.025 0.014 0.151 0.078 0.051
##
## Value 10 11 12 13 14 15 17
## Frequency 18 11 13 7 4 1 4
## Proportion 0.021 0.013 0.015 0.008 0.005 0.001 0.005
## ---------------------------------------------------------------------------
## AgeGrp
## n missing distinct
## 870 0 5
##
## lowest : 18-24 25-34 35-44 46-54 55+ , highest: 18-24 25-34 35-44 46-54 55+
##
## Value 18-24 25-34 35-44 46-54 55+
## Frequency 55 336 296 143 40
## Proportion 0.063 0.386 0.340 0.164 0.046
## ---------------------------------------------------------------------------
## AttrNum
## n missing distinct Info Sum Mean Gmd
## 870 0 2 0.405 140 0.1609 0.2704
##
## ---------------------------------------------------------------------------
## GenNum
## n missing distinct Info Sum Mean Gmd
## 870 0 2 0.724 516 0.5931 0.4832
##
## ---------------------------------------------------------------------------
## TravLevel
## n missing distinct Info Mean Gmd
## 870 0 3 0.634 1.074 0.4906
##
## Value 0 1 2
## Frequency 94 618 158
## Proportion 0.108 0.710 0.182
## ---------------------------------------------------------------------------
## OTNum
## n missing distinct Info Sum Mean Gmd
## 870 0 2 0.617 252 0.2897 0.412
##
## ---------------------------------------------------------------------------
## MariStatNum
## n missing distinct Info Mean Gmd
## 870 0 3 0.855 0.9103 0.7708
##
## Value 0 1 2
## Frequency 269 410 191
## Proportion 0.309 0.471 0.220
## ---------------------------------------------------------------------------
## EducNum
## n missing distinct Info Mean Gmd
## 870 0 6 0.898 1.268 1.359
##
## lowest : 0 1 2 3 4, highest: 1 2 3 4 5
##
## Value 0 1 2 3 4 5
## Frequency 270 358 100 75 15 52
## Proportion 0.310 0.411 0.115 0.086 0.017 0.060
## ---------------------------------------------------------------------------
## JRoleNum
## n missing distinct Info Mean Gmd
## 870 0 9 0.972 3.664 2.576
##
## lowest : 0 1 2 3 4, highest: 4 5 6 7 8
##
## Value 0 1 2 3 4 5 6 7 8
## Frequency 51 87 200 172 53 76 51 153 27
## Proportion 0.059 0.100 0.230 0.198 0.061 0.087 0.059 0.176 0.031
## ---------------------------------------------------------------------------
plot_intro(attritdf)
plot_missing(attritdf)
plot_bar(attritdf)
plot_histogram(attritdf)
plot_qq(attritdf)
# Correlationapalooza
plot_correlation(allnumeric, type = c("all", "discrete", "continuous"),
maxcat = 20L, cor_args = list(), geom_text_args = list(),
title = NULL, ggtheme = theme_gray(),
theme_config = list(legend.position = "bottom", axis.text.x =
element_text(angle = 90)))
# Correlation with just continuous variables
attritcor1 <- attritdf%>%dplyr::select(NumCompaniesWorked,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,YearsAtCompany)
plot_correlation(attritcor1)
attrcont <- attritdf%>%dplyr::select(Attrition,Age,NumCompaniesWorked,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,YearsAtCompany,MonthlyIncome,TotalWorkingYears)
attrcont %>%
filter(Attrition == "Yes") %>%
select_if(is.numeric) %>%
cor() %>%
corrplot::corrplot()
attritdf %>%
dplyr::select(Age, DailyRate, DistanceFromHome, HourlyRate, MonthlyIncome, MonthlyRate, YearsAtCompany, YearsWithCurrManager, YearsSinceLastPromotion) %>%
gather(metric, value) %>%
ggplot(aes(value, fill = metric)) +
geom_density(show.legend = FALSE) +
facet_wrap(~ metric, scales = "free")
attritdf %>%
dplyr::select(Age, DailyRate, DistanceFromHome, HourlyRate, MonthlyIncome, MonthlyRate, YearsAtCompany, YearsWithCurrManager, YearsSinceLastPromotion) %>%
gather(metric, value) %>%
ggplot(aes(value, fill = metric)) +
geom_histogram(show.legend = FALSE) +
facet_wrap(~ metric, scales = "free")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot_scatterplot(attritdf, by="MonthlyIncome")
plot_scatterplot(attritdf, by="YearsAtCompany")
plot_scatterplot(attritdf, by="YearsInCurrentRole")
plot_scatterplot(attritdf, by="YearsSinceLastPromotion")
plot_scatterplot(attritdf, by="YearsWithCurrManager")
plot_scatterplot(attritdf, by="NumCompaniesWorked")
attritdf%>%ggplot(aes(x=MonthlyIncome,YearsAtCompany,color=AgeGrp))+geom_point()
attritdf%>%ggplot(aes(x=MonthlyIncome,YearsAtCompany,color=AgeGrp))+geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
attritdf%>%ggplot(aes(x=YearsSinceLastPromotion,PercentSalaryHike,color=AgeGrp))+geom_point()
attritdf%>%ggplot(aes(x=YearsSinceLastPromotion,PercentSalaryHike,color=AgeGrp))+geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.015
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.015
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used
## at -0.015
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 1.015
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal
## condition number 0
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other
## near singularities as well. 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 2.05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1.1096e-015
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 4
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used
## at -0.05
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 2.05
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal
## condition number 1.1096e-015
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other
## near singularities as well. 4
attritdf%>%ggplot(aes(x=YearsWithCurrManager,MonthlyIncome,color=AgeGrp))+geom_point()
attritdf%>%ggplot(aes(x=YearsWithCurrManager,MonthlyIncome,color=AgeGrp))+geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.02
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 2.02
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used
## at -0.02
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 2.02
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal
## condition number 0
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other
## near singularities as well. 1
attritdf%>%ggplot(aes(x=YearsAtCompany,MonthlyIncome,color=AgeGrp))+geom_point()
attritdf%>%ggplot(aes(x=YearsAtCompany,MonthlyIncome,color=AgeGrp))+geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Attrition by Age Group
AgeGrpdf <- attritdf%>%dplyr::select(AgeGrp,Attrition)
AgeGrpdf <- AgeGrpdf%>%filter(Attrition=="Yes")
AgeGrpplot <- ggplot(AgeGrpdf,aes(x=AgeGrp,group=Attrition,color=AgeGrp))+geom_bar(aes(y = ..prop..,fill=factor(..x..)), stat="count")+ggtitle("Attrition by Age Group")+
geom_text(aes(label = scales::percent(..prop..),y= ..prop.. ), stat= "count", vjust = -.1)+labs(y = "Percent")+ylab(NULL)+xlab("Age groups")
AgeGrpplot
AgeJLev <- attritdf%>%dplyr::select(JobLevel,Attrition)
AgeJLev <- AgeJLev%>%filter(Attrition=="Yes")
AgeJLevplot <- ggplot(AgeJLev,aes(x=JobLevel,group=Attrition,color=JobLevel))+geom_bar(aes(y = ..prop..,fill=factor(..x..)), stat="count")+ggtitle("Attrition by Job Level")+
geom_text(aes(label = scales::percent(..prop..),y= ..prop.. ), stat= "count", vjust = -.1)+labs(y = "Percent")+ylab(NULL)+xlab("Job Levels")
AgeJLevplot
tempdf <- attritdf%>%dplyr::select(JobRole,Attrition,MonthlyIncome)%>%group_by(JobRole)
jrno <- tempdf%>%filter(Attrition=="Yes")
jrno <- jrno%>%summarise(AttrCnt=n())
jry <- tempdf%>%filter(Attrition=="No")
jry <- tempdf%>%summarise(NoAttrCnt=n())
AttrJobRoledf <- merge(jry,jrno)
AttrJobRoledf <- AttrJobRoledf %>% mutate(JRole = case_when(
JobRole == "Healthcare Representative" ~ "HC_Rep",
JobRole == "Human Resources" ~ "HR",
JobRole == "Laboratory Technician" ~ "Lab_Tech",
JobRole == "Manager" ~ "Mgr",
JobRole == "Manufacturing Director" ~ "Manfact_Dir",
JobRole == "Research Director" ~ "Re_Dir",
JobRole == "Research Scientist" ~ "Re_Scientist",
JobRole == "Sales Executive" ~ "SalesExec",
JobRole == "Sales Representative" ~ "SalesRep"
))
ggplot(tempdf,aes(x=NULL,y=MonthlyIncome,fill=JobRole))+geom_boxplot()+ggtitle("Summary statistics by Monthly Income and Job Role")+ylab("Monthly Income")+xlab("Job Role")
attritdf%>%ggplot()+geom_bar(aes(y=JobSatisfaction,x=JobRole,fill=JobSatisfaction),stat="identity",size=4)+theme(axis.text.x = element_text(angle = 90, hjust = 1))+ylab(NULL)+xlab(NULL)+ggtitle("Job Satisfaction Scores by Job Role")
ggplot(AttrJobRoledf,aes(x=JobRole,y=AttrCnt))+geom_point()+ylab("Attrition Count")+xlab("Job Role")+theme(axis.text.x = element_text(angle = 90, hjust = 1))
# MonthlyIncome by job role
IncomeJobRolePlot <- ggplot(tempdf,aes(JobRole, MonthlyIncome,fill=JobRole))+geom_bar(stat = "identity",position = position_stack(reverse = TRUE))+coord_flip()
IncomeJobRolePlot
# Continuous variable relationship analysis
# MonthlyIncome,YearsAtCompany,PercentSalaryHike,NumCompaniesWorked,TotalWorkingYears,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
# Attrition by gender
AttritReduced <- attritdf%>%dplyr::select(Attrition,Age,AgeGrp,Gender,YearsSinceLastPromotion,YearsWithCurrManager,JobRole,)%>%group_by(Attrition="Yes")
AttritReduced%>%ggplot(aes(x=Gender,fill=Gender))+geom_bar()+ggtitle("Bar chart of Attrition by Gender")+xlab(NULL)+ylab(NULL)+geom_text(stat = "count",aes(label=..count..))+theme(legend.position = "none")
attritdf%>%ggplot(aes(x=Gender,fill=Gender))+geom_bar()+ggtitle("Bar chart by Gender")+xlab(NULL)+ylab(NULL)+geom_text(stat = "count",aes(label=..count..))+theme(legend.position = "none")
set.seed(123)
training.samples <- attritdf$AttrNum %>%
createDataPartition(p=0.8, list = FALSE)
train.data <- attritdf[training.samples, ]
test.data <- attritdf[-training.samples, ]
attritdflog <- attritdf%>%dplyr::select(MonthlyIncome,Attrition,AttrNum,Age,GenNum,YearsInCurrentRole,TotalWorkingYears,TrainingTimesLastYear,YearsSinceLastPromotion,JobSatisfaction,PerformanceRating,RelationshipSatisfaction,EnvironmentSatisfaction,DistanceFromHome,StockOptionLevel,NumCompaniesWorked,JobInvolvement,JRoleNum,EducNum,OTNum,TravLevel,MariStatNum)%>%mutate(logYSLP = log(YearsSinceLastPromotion),logDFH=log(DistanceFromHome),logNCW=log(NumCompaniesWorked),logTWY=log(TotalWorkingYears),sqrYSLP=(YearsSinceLastPromotion)^2)
# Initial model with most variables that made sense
model1<-lm(AttrNum~Age+GenNum+YearsInCurrentRole+TotalWorkingYears+TrainingTimesLastYear+YearsSinceLastPromotion+JobSatisfaction+PerformanceRating+RelationshipSatisfaction+EnvironmentSatisfaction+DistanceFromHome+StockOptionLevel+NumCompaniesWorked+JobInvolvement+JRoleNum+EducNum+OTNum+TravLevel+MariStatNum,data=attritdflog)
outlierTest(model1)
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferroni p
## 485 3.974266 7.6709e-05 0.066737
qqPlot(model1,main = "QQ Plot")
## Warning in rlm.default(x, y, weights, method = method, wt.method =
## wt.method, : 'rlm' failed to converge in 20 steps
## [1] 485 860
leveragePlots(model1,main = "Leverage Plots")
spreadLevelPlot(model1)
## Warning in spreadLevelPlot.lm(model1):
## 183 negative fitted values removed
##
## Suggested power transformation: -0.0007314523
ncvTest(model1)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 161.8543, Df = 1, p = < 2.22e-16
crPlots(model1)
mod1resid = rstudent(model1)
hist(mod1resid,col="red",main = "Histogram of residuals for initial model",ylab = NULL,xlab = NULL)
plot(model1)
summary(model1)
##
## Call:
## lm(formula = AttrNum ~ Age + GenNum + YearsInCurrentRole + TotalWorkingYears +
## TrainingTimesLastYear + YearsSinceLastPromotion + JobSatisfaction +
## PerformanceRating + RelationshipSatisfaction + EnvironmentSatisfaction +
## DistanceFromHome + StockOptionLevel + NumCompaniesWorked +
## JobInvolvement + JRoleNum + EducNum + OTNum + TravLevel +
## MariStatNum, data = attritdflog)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.62624 -0.20117 -0.07257 0.08974 1.21118
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.8015850 0.1056067 7.590 8.53e-14 ***
## Age -0.0033755 0.0016452 -2.052 0.040505 *
## GenNum 0.0175231 0.0222197 0.789 0.430553
## YearsInCurrentRole -0.0096504 0.0039194 -2.462 0.014011 *
## TotalWorkingYears -0.0072845 0.0022859 -3.187 0.001492 **
## TrainingTimesLastYear1 -0.2070113 0.0786496 -2.632 0.008643 **
## TrainingTimesLastYear2 -0.1360809 0.0613569 -2.218 0.026833 *
## TrainingTimesLastYear3 -0.1652211 0.0613815 -2.692 0.007251 **
## TrainingTimesLastYear4 -0.1308795 0.0698904 -1.873 0.061468 .
## TrainingTimesLastYear5 -0.1732064 0.0692364 -2.502 0.012551 *
## TrainingTimesLastYear6 -0.2042315 0.0794925 -2.569 0.010366 *
## YearsSinceLastPromotion 0.0176616 0.0042659 4.140 3.82e-05 ***
## JobSatisfaction2 -0.0292029 0.0343761 -0.850 0.395839
## JobSatisfaction3 -0.0613269 0.0313729 -1.955 0.050943 .
## JobSatisfaction4 -0.1328299 0.0309672 -4.289 2.00e-05 ***
## PerformanceRating4 0.0205147 0.0301432 0.681 0.496329
## RelationshipSatisfaction2 -0.0675254 0.0344835 -1.958 0.050540 .
## RelationshipSatisfaction3 -0.0771347 0.0313699 -2.459 0.014139 *
## RelationshipSatisfaction4 -0.0869954 0.0312165 -2.787 0.005443 **
## EnvironmentSatisfaction2 -0.1340363 0.0343595 -3.901 0.000103 ***
## EnvironmentSatisfaction3 -0.1181239 0.0315743 -3.741 0.000196 ***
## EnvironmentSatisfaction4 -0.1188642 0.0314784 -3.776 0.000171 ***
## DistanceFromHome 0.0041932 0.0013440 3.120 0.001871 **
## StockOptionLevel1 -0.1045102 0.0334239 -3.127 0.001828 **
## StockOptionLevel2 -0.1425094 0.0470130 -3.031 0.002510 **
## StockOptionLevel3 -0.0004506 0.0549156 -0.008 0.993455
## NumCompaniesWorked 0.0178394 0.0047256 3.775 0.000171 ***
## JobInvolvement2 -0.2255851 0.0511692 -4.409 1.18e-05 ***
## JobInvolvement3 -0.2783164 0.0488008 -5.703 1.63e-08 ***
## JobInvolvement4 -0.3223167 0.0588934 -5.473 5.86e-08 ***
## JRoleNum 0.0111032 0.0048505 2.289 0.022324 *
## EducNum 0.0155647 0.0081165 1.918 0.055496 .
## OTNum 0.2078221 0.0241489 8.606 < 2e-16 ***
## TravLevel 0.0582844 0.0205202 2.840 0.004616 **
## MariStatNum -0.0353712 0.0228305 -1.549 0.121689
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3158 on 835 degrees of freedom
## Multiple R-squared: 0.291, Adjusted R-squared: 0.2621
## F-statistic: 10.08 on 34 and 835 DF, p-value: < 2.2e-16
car::vif(model1)
## GVIF Df GVIF^(1/(2*Df))
## Age 1.878618 1 1.370627
## GenNum 1.039202 1 1.019413
## YearsInCurrentRole 1.772554 1 1.331373
## TotalWorkingYears 2.569938 1 1.603103
## TrainingTimesLastYear 1.217523 6 1.016537
## YearsSinceLastPromotion 1.609147 1 1.268522
## JobSatisfaction 1.095869 3 1.015375
## PerformanceRating 1.019952 1 1.009927
## RelationshipSatisfaction 1.094314 3 1.015135
## EnvironmentSatisfaction 1.087170 3 1.014027
## DistanceFromHome 1.041902 1 1.020736
## StockOptionLevel 2.550517 3 1.168884
## NumCompaniesWorked 1.235894 1 1.111708
## JobInvolvement 1.090603 3 1.014560
## JRoleNum 1.076026 1 1.037317
## EducNum 1.039921 1 1.019765
## OTNum 1.046537 1 1.023004
## TravLevel 1.043908 1 1.021718
## MariStatNum 2.367149 1 1.538554
# model after looking at p-values of multi-linear regression and their VIF scores
ggplot(attritdf,aes(sample=YearsSinceLastPromotion,colour=Attrition))+stat_qq()+stat_qq_line()+ggtitle("QQPlot - YearsSinceLastPromotionw with Attrition")+xlab(NULL)+ylab(NULL)
#ggplot(attritdflog,aes(sample=logYSLP,colour=Attrition))+stat_qq()+stat_qq_line()
ggplot(attritdf,aes(sample=DistanceFromHome,colour=Attrition))+stat_qq()+stat_qq_line()+ggtitle("QQPlot - DistanceFromHome with Attrition")+xlab(NULL)+ylab(NULL)
#ggplot(attritdflog,aes(sample=logDFH,colour=Attrition))+stat_qq()+stat_qq_line()+ggtitle("QQPlot - log transformed DistanceFromHome with Attrition")
ggplot(attritdf,aes(sample=NumCompaniesWorked,colour=Attrition))+stat_qq()+stat_qq_line()+ggtitle("QQPlot - NumCompaniesWorked with Attrition")+xlab(NULL)+ylab(NULL)
#ggplot(attritdflog,aes(sample=logNCW,colour=AttrNum))+stat_qq()+stat_qq_line()+ggtitle("QQPlot - log transformed NumCompaniesWorked with Attrition")
ggplot(attritdflog,aes(sample=TotalWorkingYears,colour=Attrition))+stat_qq()+stat_qq_line()+ggtitle("QQPlot - TotalWorkingYears with Attrition")+xlab(NULL)+ylab(NULL)
#ggplot(attritdflog,aes(sample=logTWY,colour=Attrition))+stat_qq()+stat_qq_line()+ggtitle("QQPlot - log transformed TotalWorkingYears with Attrition")
model2 <- lm(AttrNum~YearsSinceLastPromotion+JobSatisfaction+EnvironmentSatisfaction+DistanceFromHome+NumCompaniesWorked+JobInvolvement+OTNum, data=attritdflog)
summary(model2)
##
## Call:
## lm(formula = AttrNum ~ YearsSinceLastPromotion + JobSatisfaction +
## EnvironmentSatisfaction + DistanceFromHome + NumCompaniesWorked +
## JobInvolvement + OTNum, data = attritdflog)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.60343 -0.18719 -0.08282 0.00263 1.03786
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.4782913 0.0646736 7.395 3.36e-13 ***
## YearsSinceLastPromotion -0.0001114 0.0036479 -0.031 0.975655
## JobSatisfaction2 -0.0191338 0.0367817 -0.520 0.603059
## JobSatisfaction3 -0.0457316 0.0333490 -1.371 0.170640
## JobSatisfaction4 -0.1206412 0.0329505 -3.661 0.000266 ***
## EnvironmentSatisfaction2 -0.1166118 0.0366180 -3.185 0.001502 **
## EnvironmentSatisfaction3 -0.1178138 0.0336459 -3.502 0.000486 ***
## EnvironmentSatisfaction4 -0.1086244 0.0335426 -3.238 0.001248 **
## DistanceFromHome 0.0029613 0.0014275 2.075 0.038329 *
## NumCompaniesWorked 0.0077692 0.0046150 1.683 0.092647 .
## JobInvolvement2 -0.2500019 0.0547267 -4.568 5.64e-06 ***
## JobInvolvement3 -0.3133234 0.0520614 -6.018 2.61e-09 ***
## JobInvolvement4 -0.3557686 0.0627190 -5.672 1.93e-08 ***
## OTNum 0.2204916 0.0256488 8.597 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3403 on 856 degrees of freedom
## Multiple R-squared: 0.1562, Adjusted R-squared: 0.1434
## F-statistic: 12.19 on 13 and 856 DF, p-value: < 2.2e-16
car::vif(model2)
## GVIF Df GVIF^(1/(2*Df))
## YearsSinceLastPromotion 1.013628 1 1.006791
## JobSatisfaction 1.026316 3 1.004339
## EnvironmentSatisfaction 1.032081 3 1.005277
## DistanceFromHome 1.012403 1 1.006183
## NumCompaniesWorked 1.015355 1 1.007648
## JobInvolvement 1.023754 3 1.003920
## OTNum 1.016974 1 1.008451
plot(model2)
mod2resid = rstudent(model2)
hist(mod2resid,col="blue",main = "Histogram of residuals of final model",ylab = NULL,xlab = NULL)
attritdf%>%ggplot(aes(x=YearsAtCompany,MonthlyIncome,color=Gender))+geom_point()
attritdf%>%ggplot(aes(x=YearsAtCompany,MonthlyIncome,color=Gender))+geom_smooth()+ggtitle("Monthly income over years with company - M vs F")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
attritdf%>%ggplot(aes(x=YearsWithCurrManager,MonthlyIncome,color=Gender))+geom_point()
attritdf%>%ggplot(aes(x=YearsWithCurrManager,MonthlyIncome,color=Gender))+geom_smooth()+ggtitle("Comparison of Income by Gender over time with same manager")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
attritdf%>%ggplot(aes(x=Age,MonthlyIncome,color=Gender))+geom_point()
attritdf%>%ggplot(aes(x=Age,MonthlyIncome,color=Gender))+geom_smooth()+ggtitle("Monthly Income and Age - M vs F")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(attritdf,aes(sample=MonthlyIncome,colour=factor(Gender)))+stat_qq()+stat_qq_line()
genNCWviolin <- ggplot(attritdf,aes(x=JobRole,y=NumCompaniesWorked,fill=Gender))+geom_violin()+theme(axis.text.x = element_text(angle = 90, hjust = 1))+ggtitle("Number of Companies worked by Job Role and Gender")
genNCWviolin
genYACviolin <- ggplot(attritdf, aes(x=JobRole, y=YearsAtCompany,fill=Gender))+geom_violin()+theme(axis.text.x = element_text(angle = 90, hjust = 1))+ggtitle("Years At the Company by Job Role and Gender")
genYACviolin
genNCW <- ggplot(attritdf, aes(x=JobRole,y=MonthlyIncome,fill=Gender))+geom_col(position = "dodge2")+theme(axis.text.x = element_text(angle = 90, hjust = 1))
genNCW+ggtitle("Income by Job Role and Gender")
set.seed(123)
perc=.8
numks = 5
iterations=100
masterAcc = matrix(nrow = iterations, ncol = numks)
masterSensitivity = matrix(nrow = iterations)
masterSpecificity = matrix(nrow = iterations)
for(j in 1:iterations)
{
accs = data.frame(accuracy=numeric(numks),k=numeric(numks))
trainIndices = sample(1:dim(attritdf)[1],
round(perc * dim(attritdf)[1]))
train = attritdf[trainIndices,]
test = attritdf[-trainIndices,]
for(i in 1:numks)
{
classifications = knn(train[,c(2,12,15,16,18,22,25)],test[,c(2,12,15,16,18,22,25)],train$AttrNum,prob = TRUE, k = i)
table(classifications,test$AttrNum)
CM = confusionMatrix(table(classifications,test$AttrNum))
masterAcc[j,i] = CM$overall[1]
masterSensitivity[j]=CM$byClass[1]
masterSpecificity[j]=CM$byClass[2]
}
}
ptitle=print(paste0(numks," values of K for ",iterations," iterations."))
## [1] "5 values of K for 100 iterations."
MeanAcc = colMeans(masterAcc)
MeanSpec = mean(masterSpecificity)
MeanSens = mean(masterSensitivity)
plot(seq(1,numks,1),MeanAcc, type = "l",xlab = "Values of K",ylab = "Accuracy",main = ptitle)
which.max(MeanAcc)
## [1] 5
CM
## Confusion Matrix and Statistics
##
##
## classifications 0 1
## 0 142 23
## 1 5 4
##
## Accuracy : 0.8391
## 95% CI : (0.7759, 0.8903)
## No Information Rate : 0.8448
## P-Value [Acc > NIR] : 0.631416
##
## Kappa : 0.1568
##
## Mcnemar's Test P-Value : 0.001315
##
## Sensitivity : 0.9660
## Specificity : 0.1481
## Pos Pred Value : 0.8606
## Neg Pred Value : 0.4444
## Prevalence : 0.8448
## Detection Rate : 0.8161
## Detection Prevalence : 0.9483
## Balanced Accuracy : 0.5571
##
## 'Positive' Class : 0
##
set.seed(123)
perc=.8
numks = 5
vars = c("")
iterations=20
masterAcc = matrix(nrow = iterations, ncol = numks)
masterSensitivity = matrix(nrow = iterations)
masterSpecificity = matrix(nrow = iterations)
for(i in 1:iterations)
{
rn=sample(1:30,1)
trainIndices = sample(1:dim(attritdf)[1],round(perc * dim(attritdf)[1]))
train = attritdf[trainIndices,]
test = attritdf[-trainIndices,]
model = naiveBayes(train[,c(2,6,12,15,16,17,18,20,24,25,27,29,30,31,33,34)],as.factor(train$Attrition),laplace = 1)
table(predict(model,test[,c(2,6,12,15,16,17,18,20,24,25,27,29,30,31,33,34)]),as.factor(test$Attrition))
CM = confusionMatrix(table(predict(model,test[,c(2,6,12,15,16,17,18,20,24,25,27,29,30,31,33,34)]),as.factor(test$Attrition)))
masterAcc[j]=CM$overall[1]
masterSensitivity[i]=CM$byClass[1]
masterSpecificity[i]=CM$byClass[1]
}
# Adjusted for noattritiondf
# c(2,5,11,14,15,16,17,19,23,24,26,28,29,30,32,33)
tmpdf <- attritdf[c(2,6,12,15,16,17,18,20,24,25,27,29,30,31,33,34)]
tmpdf2 <- noattritdf[c(2,5,11,14,15,16,17,19,23,24,26,28,29,30,32,33)]
attritOut <- predict(model,tmpdf2)
myout=cbind.data.frame(noattritdf$ID,attritOut)
colnames(myout) <- c("ID","Attrition")
write.csv(myout, file = "/Users/malco/Documents/MDS/MDS_6306_DDS/DDS-CaseStudy2/Case2PredictionsMalcolmCarlsonAttrition.csv",row.names = FALSE)
plot(attritOut, col="darkgreen", main = "Plot of Attrition Predictions")
CM
## Confusion Matrix and Statistics
##
##
## No Yes
## No 125 12
## Yes 14 23
##
## Accuracy : 0.8506
## 95% CI : (0.7888, 0.9)
## No Information Rate : 0.7989
## P-Value [Acc > NIR] : 0.05026
##
## Kappa : 0.5448
##
## Mcnemar's Test P-Value : 0.84452
##
## Sensitivity : 0.8993
## Specificity : 0.6571
## Pos Pred Value : 0.9124
## Neg Pred Value : 0.6216
## Prevalence : 0.7989
## Detection Rate : 0.7184
## Detection Prevalence : 0.7874
## Balanced Accuracy : 0.7782
##
## 'Positive' Class : No
##
set.seed(123)
training.samples <- attritdf$MonthlyIncome %>%
createDataPartition(p=0.8, list = FALSE)
train.data <- attritdf[training.samples, ]
test.data <- attritdf[-training.samples, ]
modelSalary<-lm(MonthlyIncome~JobLevel+JobRole+
TotalWorkingYears,data=attritdf)
pairs(~MonthlyIncome+JobLevel+TotalWorkingYears,data=attritdf)
plot(modelSalary)
car::vif(modelSalary)
## GVIF Df GVIF^(1/(2*Df))
## JobLevel 15.167434 4 1.404798
## JobRole 9.813688 8 1.153425
## TotalWorkingYears 2.918135 1 1.708255
mod1resid = rstudent(modelSalary)
hist(mod1resid,col="darkgreen",main = "Histogram of residuals for model",ylab = NULL,xlab = NULL)
predictions <- modelSalary %>% predict(test.data)
data.frame( R2 = R2(predictions, test.data$MonthlyIncome),
RMSE = RMSE(predictions, test.data$MonthlyIncome),
MAE = MAE(predictions, test.data$MonthlyIncome))
## R2 RMSE MAE
## 1 0.947316 1034.706 754.5801
summary(modelSalary)
##
## Call:
## lm(formula = MonthlyIncome ~ JobLevel + JobRole + TotalWorkingYears,
## data = attritdf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3184.4 -622.9 -83.6 623.4 4282.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3620.912 180.824 20.025 < 2e-16 ***
## JobLevel2 1687.533 139.069 12.134 < 2e-16 ***
## JobLevel3 4913.569 186.878 26.293 < 2e-16 ***
## JobLevel4 8246.787 282.379 29.205 < 2e-16 ***
## JobLevel5 11003.473 332.523 33.091 < 2e-16 ***
## JobRoleHuman Resources -1106.211 252.764 -4.376 1.36e-05 ***
## JobRoleLaboratory Technician -1243.528 175.324 -7.093 2.75e-12 ***
## JobRoleManager 3332.134 237.808 14.012 < 2e-16 ***
## JobRoleManufacturing Director 134.719 158.506 0.850 0.396
## JobRoleResearch Director 3479.341 211.496 16.451 < 2e-16 ***
## JobRoleResearch Scientist -1040.524 178.715 -5.822 8.21e-09 ***
## JobRoleSales Executive -13.146 136.850 -0.096 0.923
## JobRoleSales Representative -1267.454 220.601 -5.745 1.27e-08 ***
## TotalWorkingYears 45.975 7.768 5.918 4.70e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1007 on 856 degrees of freedom
## Multiple R-squared: 0.9527, Adjusted R-squared: 0.952
## F-statistic: 1327 on 13 and 856 DF, p-value: < 2.2e-16
salaryPred <- modelSalary%>% predict(nosalarydf)
myout <- cbind.data.frame(nosalarydf$ID,salaryPred)
colnames(myout) <- c("ID","MonthlyIncome")
write.csv(myout, file = "/Users/malco/Documents/MDS/MDS_6306_DDS/DDS-CaseStudy2/Case2PredictionsMalcolmCarlsonSalary.csv",row.names = FALSE)
attritdf2 <- attritdf%>%dplyr::select(Age,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager)
# Fit the full model
set.seed(123)
full.model <- lm(MonthlyIncome ~., data = attritdf2)
# Stepwise regression model
step.model <- stepAIC(full.model, direction = "both",
trace = FALSE)
pairs(~MonthlyIncome+JobLevel+TotalWorkingYears,data=attritdf)
plot(step.model)
car::vif(step.model)
## GVIF Df GVIF^(1/(2*Df))
## DailyRate 1.011976 1 1.005970
## JobLevel 15.236178 4 1.405593
## JobRole 9.868255 8 1.153825
## TotalWorkingYears 2.918173 1 1.708266
step.modelid = rstudent(step.model)
hist(step.modelid,col="darkgreen",main = "Histogram of residuals for model",ylab = NULL,xlab = NULL)
predictions <- step.model %>% predict(test.data)
data.frame( R2 = R2(predictions, test.data$MonthlyIncome),
RMSE = RMSE(predictions, test.data$MonthlyIncome),
MAE = MAE(predictions, test.data$MonthlyIncome))
## R2 RMSE MAE
## 1 0.9475298 1033.253 750.574
summary(step.model)
##
## Call:
## lm(formula = MonthlyIncome ~ DailyRate + JobLevel + JobRole +
## TotalWorkingYears, data = attritdf2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3256.9 -624.8 -84.8 596.6 4173.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.478e+03 1.939e+02 17.934 < 2e-16 ***
## DailyRate 1.728e-01 8.554e-02 2.020 0.0437 *
## JobLevel2 1.681e+03 1.389e+02 12.103 < 2e-16 ***
## JobLevel3 4.907e+03 1.866e+02 26.303 < 2e-16 ***
## JobLevel4 8.246e+03 2.819e+02 29.253 < 2e-16 ***
## JobLevel5 1.097e+04 3.322e+02 33.034 < 2e-16 ***
## JobRoleHuman Resources -1.101e+03 2.523e+02 -4.362 1.45e-05 ***
## JobRoleLaboratory Technician -1.236e+03 1.750e+02 -7.060 3.44e-12 ***
## JobRoleManager 3.367e+03 2.380e+02 14.146 < 2e-16 ***
## JobRoleManufacturing Director 1.410e+02 1.583e+02 0.891 0.3732
## JobRoleResearch Director 3.497e+03 2.113e+02 16.551 < 2e-16 ***
## JobRoleResearch Scientist -1.035e+03 1.784e+02 -5.802 9.23e-09 ***
## JobRoleSales Executive -8.351e+00 1.366e+02 -0.061 0.9513
## JobRoleSales Representative -1.263e+03 2.202e+02 -5.734 1.36e-08 ***
## TotalWorkingYears 4.592e+01 7.754e+00 5.921 4.62e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1005 on 855 degrees of freedom
## Multiple R-squared: 0.9529, Adjusted R-squared: 0.9522
## F-statistic: 1237 on 14 and 855 DF, p-value: < 2.2e-16
car::vif(step.model)
## GVIF Df GVIF^(1/(2*Df))
## DailyRate 1.011976 1 1.005970
## JobLevel 15.236178 4 1.405593
## JobRole 9.868255 8 1.153825
## TotalWorkingYears 2.918173 1 1.708266
salaryPred <- step.model%>% predict(nosalarydf)
myout <- cbind.data.frame(nosalarydf$ID,salaryPred)
colnames(myout) <- c("ID","MonthlyIncome")
write.csv(myout, file = "/Users/malco/Documents/MDS/MDS_6306_DDS/DDS-CaseStudy2/Case2PredictionsMalcolmCarlsonSalary.csv",row.names = FALSE)